Manifold learning with Feature-type distributed clustering workflow is more informative compared to UMAP for tabular clinical datasets¶

Importing necessary libraries¶

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow import keras
import umap.umap_ as umap
%config InlineBackend.figure_format = 'svg'

Importing pre-processed data¶

In [2]:
np.random.seed(42)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
data=pd.read_csv('Preprocessed_DM_xx.csv')
In [3]:
np.random.seed(42)
data=data.sample(frac=1) #Shuffle the data set

Feature engineering¶

  • Creating new feature called hypertension
  • Filtering unnecessary details
In [4]:
np.random.seed(42)
HTN_indexes=data.loc[(data['Currently.taking.a.prescribed.medicine.to.lower.BP'] != 0) | (data['First.SYSTOLIC.reading'] >= 140) | (data['First.DIASTOLIC.reading'] >= 90) ].index.values
HTN_cols=np.zeros(data.shape[0])
HTN_cols[[HTN_indexes]]=1
data['HTN']=HTN_cols
data=data.drop(["First.SYSTOLIC.reading","First.DIASTOLIC.reading","Currently.taking.a.prescribed.medicine.to.lower.BP"], axis=1)
data=data.reset_index(drop=True)
data.columns
data=data.drop(["Hb_adjust_alt_smok","Second.SYSTOLIC.reading","Second.DIASTOLIC.reading","Third.SYSTOLIC.reading","Third.DIASTOLIC.reading","Hb_status","Glucose.level",'SBP_status'], axis=1)
data=data.loc[data['BMI'] != 99.99]
data=data.loc[data['Hemoglobin.level..g.dl...1.decimal.'] != 99.99]
data=data.loc[data['Currently.has.asthma'] != .5]
data=data.loc[data['Currently.has.thyroid.disorder'] != .5]
data=data.loc[data['Currently.has.heart.disease'] != .5]
data=data.loc[data['Currently.has.cancer'] != .5]
data=data.loc[data['DM_history'] == 1]
data=data.loc[data['Type.of.caste.or.tribe.of.the.household.head'] != 0]
data=data.loc[data['Time.to.get.to.water.source..minutes.'] != -1]
data=data.drop(["Unnamed: 0","DM_status","DM_history"], axis=1)
In [5]:
np.random.seed(42)
i=[x for x in range(10125)]

data.set_index(pd.Series(i), inplace=True) # Reset the index
In [6]:
data.shape
Out[6]:
(10125, 41)
In [7]:
ord_list=['Drinks.alcohol', 'Smoking_stat','Has.refrigerator',
       'Has.bicycle', 'Has.motorcycle.scooter', 'Has.car.truck', 'Owns.livestock..herds.or.farm.animals','Frequency.takes.milk.or.curd',
       'Frequency.eats.pulses.or.beans',
       'Frequency.eats.dark.green.leafy.vegetable', 'Frequency.eats.fruits',
       'Frequency.eats.eggs', 'Frequency.eats.fish',
       'Frequency.eats.chicken.or.meat', 'Frequency.eats.fried.food',
       'Frequency.takes.aerated.drinks','Frequency.household.members.smoke.inside.the.house','Wealth.index',
       'Highest.educational.level','Currently.has.asthma',
       'Currently.has.thyroid.disorder', 'Currently.has.heart.disease',
       'Currently.has.cancer', 'Suffers.from.TB','HTN' ]
cont_list=['Current.age','BMI','Hemoglobin.level..g.dl...1.decimal.','Time.to.get.to.water.source..minutes.']
nom_list=['Household.head.s.religion', 'Sex', 'Type.of.place.of.residence', 'Household.structure',
       'Type.of.caste.or.tribe.of.the.household.head','Type.of.cooking.fuel','Source.of.drinking.water']

UMAP on original data¶

In [8]:
from fdc.fdc import feature_clustering
from fdc.fdc import canberra_modified
modified_can = canberra_modified
from fdc.fdc import FDC, Clustering
In [9]:
umap_emb=feature_clustering(15,0.1,'euclidean',data,True)
2023-06-22T19:38:26.178552 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/

ANOVA test, Silhouette_score and Dunn index for umap clusters extracted using K-means clustering¶

In [10]:
from fdc.clustering import Clustering
In [11]:
umap_clustering=Clustering(umap_emb,umap_emb,True)
umap_cluster_list,umap_cluster_counts=umap_clustering.K_means(2)
2023-06-22T19:38:32.464673 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
In [12]:
### ANOVA TEST


data['Cluster']=umap_cluster_list
test_results={}
c_names=data.columns
from scipy.stats import f_oneway
for i in c_names[:-1]:
    for j in range(len(np.unique(data.Cluster))):#add -1 if there is noise otherwise not necessary------applicable only for DBSCAN
        var_name="c_"+str(j)
        myVars = vars()
        myVars[var_name]=list(data[i][data["Cluster"]==j])
    
    stats,p_val=f_oneway(c_0,c_1)
    test_results[i]=stats,float("{:.4f}".format(p_val))

test_results=pd.DataFrame.from_dict(test_results, orient='index', columns=["stats","p_value"])
test_column_list=test_results.index[test_results["p_value"]<0.05].to_list()
ord_count=nom_count=cont_count=0
for i in test_column_list:
    if i in ord_list:
        ord_count+=1
    elif i in nom_list:
        nom_count+=1
    elif i in cont_list:
        cont_count+=1
print('percentage of all features having p-value less than 0.05: ',float("{:.2f}".format((len(test_column_list)/(len(c_names)-1))*100)),'%')
print('percentage of ordinal features having p-value less than 0.05: ',(ord_count/len(ord_list))*100,'%')
print('percentage of  nominal features having p-value less than 0.05: ',(nom_count/len(nom_list))*100,'%')
print('percentage of continous features having p-value less than 0.05: ',(cont_count/len(cont_list))*100,'%')
percentage of all features having p-value less than 0.05:  68.29 %
percentage of ordinal features having p-value less than 0.05:  64.0 %
percentage of  nominal features having p-value less than 0.05:  100.0 %
percentage of continous features having p-value less than 0.05:  75.0 %
In [13]:
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
In [14]:
silhouette_score(umap_emb, umap_cluster_list, metric='euclidean')
Out[14]:
0.5998863206501435

Visualizing Silhouette score (you can also choose the number of clusters based on score)¶

In [15]:
from cluster_val import *
In [16]:
Silhouette_visual(umap_emb)
2023-06-22T19:39:03.290419 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
In [17]:
elbow_plot(umap_emb)
2023-06-22T19:39:15.007116 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
In [18]:
dunn_index(cluster_wise_df(umap_emb,umap_cluster_list))
Out[18]:
0.00287162771413835

ANOVA test, Silhouette_score and Dunn index for umap clusters extracted using Agglomerative clustering¶

In [19]:
umap_cluster_list_agglo,umap_cluster_counts_agglo=umap_clustering.Agglomerative(2,'euclidean','ward')
2023-06-22T19:40:59.200193 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
In [20]:
### ANOVA TEST

data['Cluster']=umap_cluster_list_agglo
test_results={}
c_names=data.columns
from scipy.stats import f_oneway
for i in c_names[:-1]:
    for j in range(len(np.unique(data.Cluster))):#add -1 if there is noise otherwise not necessary------applicable only for DBSCAN
        var_name="c_"+str(j)
        myVars = vars()
        myVars[var_name]=list(data[i][data["Cluster"]==j])
    
    stats,p_val=f_oneway(c_0,c_1)
    test_results[i]=stats,float("{:.4f}".format(p_val))

test_results=pd.DataFrame.from_dict(test_results, orient='index', columns=["stats","p_value"])
test_column_list=test_results.index[test_results["p_value"]<0.05].to_list()
ord_count=nom_count=cont_count=0
for i in test_column_list:
    if i in ord_list:
        ord_count+=1
    elif i in nom_list:
        nom_count+=1
    elif i in cont_list:
        cont_count+=1
print('percentage of all features having p-value less than 0.05: ',float("{:.2f}".format((len(test_column_list)/(len(c_names)-1))*100)),'%')
print('percentage of ordinal features having p-value less than 0.05: ',(ord_count/len(ord_list))*100,'%')
print('percentage of  nominal features having p-value less than 0.05: ',(nom_count/len(nom_list))*100,'%')
print('percentage of continous features having p-value less than 0.05: ',(cont_count/len(cont_list))*100,'%')
percentage of all features having p-value less than 0.05:  70.73 %
percentage of ordinal features having p-value less than 0.05:  68.0 %
percentage of  nominal features having p-value less than 0.05:  100.0 %
percentage of continous features having p-value less than 0.05:  75.0 %
In [21]:
silhouette_score(umap_emb, umap_cluster_list_agglo, metric='euclidean')
Out[21]:
0.6148981607281019
In [22]:
dunn_index(cluster_wise_df(umap_emb,umap_cluster_list_agglo))
Out[22]:
0.2598093251538023

ANOVA test, Silhouette_score and Dunn index for umap clusters extracted using DBSCAN clustering¶

In [23]:
umap_cluster_list_dbscan,umap_cluster_counts_dbscan=umap_clustering.DBSCAN(0.8,160)
2023-06-22T19:42:49.837996 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
In [24]:
### ANOVA TEST

data['Cluster']=umap_cluster_list_dbscan
test_results={}
c_names=data.columns
from scipy.stats import f_oneway
for i in c_names[:-1]:
    for j in range(len(np.unique(data.Cluster))-1):#add -1 if there is noise otherwise not necessary------applicable only for DBSCAN
        var_name="c_"+str(j)
        myVars = vars()
        myVars[var_name]=list(data[i][data["Cluster"]==j])
    
    stats,p_val=f_oneway(c_0,c_1)
    test_results[i]=stats,float("{:.4f}".format(p_val))

test_results=pd.DataFrame.from_dict(test_results, orient='index', columns=["stats","p_value"])
test_column_list=test_results.index[test_results["p_value"]<0.05].to_list()
ord_count=nom_count=cont_count=0
for i in test_column_list:
    if i in ord_list:
        ord_count+=1
    elif i in nom_list:
        nom_count+=1
    elif i in cont_list:
        cont_count+=1
print('percentage of all features having p-value less than 0.05: ',float("{:.2f}".format((len(test_column_list)/(len(c_names)-1))*100)),'%')
print('percentage of ordinal features having p-value less than 0.05: ',(ord_count/len(ord_list))*100,'%')
print('percentage of  nominal features having p-value less than 0.05: ',(nom_count/len(nom_list))*100,'%')
print('percentage of continous features having p-value less than 0.05: ',(cont_count/len(cont_list))*100,'%')
percentage of all features having p-value less than 0.05:  70.73 %
percentage of ordinal features having p-value less than 0.05:  68.0 %
percentage of  nominal features having p-value less than 0.05:  100.0 %
percentage of continous features having p-value less than 0.05:  75.0 %
In [25]:
#removing noise indices from the embeddings
non_noise_indices= np.where(np.array(umap_cluster_list_dbscan)!=-1)
umap_emb= umap_emb.iloc[non_noise_indices]
#FDC_emb_low= FDC_emb_low.iloc[non_noise_indices]
umap_cluster_list_dbscan= np.array(umap_cluster_list_dbscan)[non_noise_indices]
In [26]:
silhouette_score(umap_emb, umap_cluster_list_dbscan, metric='euclidean')
Out[26]:
0.6153916171327859
In [27]:
dunn_index(cluster_wise_df(umap_emb,umap_cluster_list_dbscan))
Out[27]:
0.2598093251538023

Dividing features¶

  • ord_list=ordinal features
  • cont_list=continueous features
  • nom_list=nominal features
In [28]:
ord_list=['Drinks.alcohol', 'Smoking_stat','Has.refrigerator',
       'Has.bicycle', 'Has.motorcycle.scooter', 'Has.car.truck', 'Owns.livestock..herds.or.farm.animals','Frequency.takes.milk.or.curd',
       'Frequency.eats.pulses.or.beans',
       'Frequency.eats.dark.green.leafy.vegetable', 'Frequency.eats.fruits',
       'Frequency.eats.eggs', 'Frequency.eats.fish',
       'Frequency.eats.chicken.or.meat', 'Frequency.eats.fried.food',
       'Frequency.takes.aerated.drinks','Frequency.household.members.smoke.inside.the.house','Wealth.index',
       'Highest.educational.level','Currently.has.asthma',
       'Currently.has.thyroid.disorder', 'Currently.has.heart.disease',
       'Currently.has.cancer', 'Suffers.from.TB','HTN' ]
cont_list=['Current.age','BMI','Hemoglobin.level..g.dl...1.decimal.','Time.to.get.to.water.source..minutes.']
nom_list=['Household.head.s.religion', 'Sex', 'Type.of.place.of.residence', 'Household.structure',
       'Type.of.caste.or.tribe.of.the.household.head','Type.of.cooking.fuel','Source.of.drinking.water']
In [29]:
len(ord_list)
Out[29]:
25
In [30]:
len(nom_list)
Out[30]:
7
In [31]:
len(cont_list)
Out[31]:
4

FDC on original data¶

In [32]:
from fdc.fdc import feature_clustering
from fdc.fdc import FDC, Clustering
from fdc.fdc import canberra_modified
modified_can = canberra_modified
In [33]:
fdc = FDC(clustering_cont=Clustering('euclidean', 15, 0.1)
          , clustering_ord=Clustering(modified_can, 15, 0.1)
          , clustering_nom=Clustering('hamming', 15, 0.1, max_components=1)
          , visual=True
          , use_pandas_output=True
          , with_2d_embedding=True
          )

fdc.selectFeatures(continueous=cont_list, nomial=nom_list, ordinal=ord_list)

FDC_emb_high,FDC_emb_low = fdc.normalize(data,n_neighbors=15, min_dist=0.1,cont_list=cont_list, nom_list=nom_list, ord_list=ord_list,
                  with_2d_embedding=True,
                  visual=True)
FDC.normalize (init): 0.00000 / 0.000s
FDC.normalize (clustering CONT): 15.39062 / 15.391s
FDC.normalize (clustering ORD): 104.07812 / 119.469s
FDC.normalize (clustering NOM): 77.57812 / 197.047s
FDC.normalize (concat): 0.00000 / 197.047s
FDC.normalize (umap 5 -> 2): 8.82812 / 205.875s
FDC.normalize (array -> DataFrame): 0.00000 / 205.875s
2023-06-22T19:49:45.170349 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
FDC.normalize (plotting): 1.57812 / 207.453s
FDC.normalize (array -> DataFrame): 0.00000 / 207.453s
FDC.normalize (total): 0.00000 / 207.453s

ANOVA test, Silhouette_score and Dunn index for FDC clusters extracted using K-means clustering¶

In [34]:
from fdc.clustering import Clustering
In [35]:
FDC_emb_low.rename(columns={"UMAP_0": "FDC_0", "UMAP_1": "FDC_1"},inplace=True)
In [36]:
clustering=Clustering(FDC_emb_low,FDC_emb_low,True)
cluster_list,cluster_counts=clustering.K_means(4)
2023-06-22T19:49:48.784747 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
In [37]:
### ANOVA TEST

data['Cluster']=cluster_list
test_results={}
c_names=data.columns
from scipy.stats import f_oneway
for i in c_names[:-1]:
    for j in range(len(np.unique(data.Cluster))):#add -1 if there is noise otherwise not necessary------applicable only for DBSCAN
        var_name="c_"+str(j)
        myVars = vars()
        myVars[var_name]=list(data[i][data["Cluster"]==j])
    
    stats,p_val=f_oneway(c_0,c_1,c_2,c_3)
    test_results[i]=stats,float("{:.4f}".format(p_val))

test_results=pd.DataFrame.from_dict(test_results, orient='index', columns=["stats","p_value"])
test_column_list=test_results.index[test_results["p_value"]<0.05].to_list()
ord_count=nom_count=cont_count=0
for i in test_column_list:
    if i in ord_list:
        ord_count+=1
    elif i in nom_list:
        nom_count+=1
    elif i in cont_list:
        cont_count+=1
print('percentage of all features having p-value less than 0.05: ',float("{:.2f}".format((len(test_column_list)/(len(c_names)-1))*100)),'%')
print('percentage of ordinal features having p-value less than 0.05: ',(ord_count/len(ord_list))*100,'%')
print('percentage of  nominal features having p-value less than 0.05: ',(nom_count/len(nom_list))*100,'%')
print('percentage of continous features having p-value less than 0.05: ',(cont_count/len(cont_list))*100,'%')
percentage of all features having p-value less than 0.05:  95.12 %
percentage of ordinal features having p-value less than 0.05:  92.0 %
percentage of  nominal features having p-value less than 0.05:  100.0 %
percentage of continous features having p-value less than 0.05:  100.0 %
In [38]:
silhouette_score(FDC_emb_low, cluster_list, metric='euclidean')
Out[38]:
0.6305214573899908
In [39]:
dunn_index(cluster_wise_df(FDC_emb_low,cluster_list))
Out[39]:
0.47412534436557996
In [40]:
elbow_plot(FDC_emb_low)
2023-06-22T19:51:29.528125 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/

Visualizing Shilhouette score for low_dim fdc embedding¶

In [41]:
Silhouette_visual(FDC_emb_low)
2023-06-22T19:52:01.500486 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/

ANOVA test, Silhouette_score and Dunn index for FDC clusters extracted using Agglomerative clustering¶

In [42]:
cluster_list_agglo,cluster_counts_agglo=clustering.Agglomerative(4,'euclidean','ward')
2023-06-22T19:52:05.769315 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
In [43]:
### ANOVA TEST

data['Cluster']=cluster_list_agglo
test_results={}
c_names=data.columns
from scipy.stats import f_oneway
for i in c_names[:-1]:
    for j in range(len(np.unique(data.Cluster))):#add -1 if there is noise otherwise not necessary------applicable only for DBSCAN
        var_name="c_"+str(j)
        myVars = vars()
        myVars[var_name]=list(data[i][data["Cluster"]==j])
    
    stats,p_val=f_oneway(c_0,c_1,c_2,c_3)
    test_results[i]=stats,float("{:.4f}".format(p_val))

test_results=pd.DataFrame.from_dict(test_results, orient='index', columns=["stats","p_value"])
test_column_list=test_results.index[test_results["p_value"]<0.05].to_list()
ord_count=nom_count=cont_count=0
for i in test_column_list:
    if i in ord_list:
        ord_count+=1
    elif i in nom_list:
        nom_count+=1
    elif i in cont_list:
        cont_count+=1
print('percentage of all features having p-value less than 0.05: ',float("{:.2f}".format((len(test_column_list)/(len(c_names)-1))*100)),'%')
print('percentage of ordinal features having p-value less than 0.05: ',(ord_count/len(ord_list))*100,'%')
print('percentage of  nominal features having p-value less than 0.05: ',(nom_count/len(nom_list))*100,'%')
print('percentage of continous features having p-value less than 0.05: ',(cont_count/len(cont_list))*100,'%')
percentage of all features having p-value less than 0.05:  95.12 %
percentage of ordinal features having p-value less than 0.05:  92.0 %
percentage of  nominal features having p-value less than 0.05:  100.0 %
percentage of continous features having p-value less than 0.05:  100.0 %
In [44]:
silhouette_score(FDC_emb_low, cluster_list_agglo, metric='euclidean')
Out[44]:
0.6221553723177761
In [45]:
dunn_index(cluster_wise_df(FDC_emb_low,cluster_list_agglo))
Out[45]:
0.33757502489285046

ANOVA test, Silhouette_score and Dunn index for FDC clusters extracted using DBSCAN clustering¶

In [46]:
cluster_list_dbscan,cluster_counts_dbscan=clustering.DBSCAN(0.95,260)
2023-06-22T19:53:37.249177 image/svg+xml Matplotlib v3.6.3, https://matplotlib.org/
In [47]:
### ANOVA TEST


data['Cluster']=cluster_list_dbscan
test_results={}
c_names=data.columns
from scipy.stats import f_oneway
for i in c_names[:-1]:
    for j in range(len(np.unique(data.Cluster))-1):#add -1 if there is noise otherwise not necessary------applicable only for DBSCAN
        var_name="c_"+str(j)
        myVars = vars()
        myVars[var_name]=list(data[i][data["Cluster"]==j])
    
    stats,p_val=f_oneway(c_0,c_1,c_2,c_3,c_4)
    test_results[i]=stats,float("{:.4f}".format(p_val))

test_results=pd.DataFrame.from_dict(test_results, orient='index', columns=["stats","p_value"])
test_column_list=test_results.index[test_results["p_value"]<0.05].to_list()
ord_count=nom_count=cont_count=0
for i in test_column_list:
    if i in ord_list:
        ord_count+=1
    elif i in nom_list:
        nom_count+=1
    elif i in cont_list:
        cont_count+=1
print('percentage of all features having p-value less than 0.05: ',float("{:.2f}".format((len(test_column_list)/(len(c_names)-1))*100)),'%')
print('percentage of ordinal features having p-value less than 0.05: ',(ord_count/len(ord_list))*100,'%')
print('percentage of  nominal features having p-value less than 0.05: ',(nom_count/len(nom_list))*100,'%')
print('percentage of continous features having p-value less than 0.05: ',(cont_count/len(cont_list))*100,'%')
percentage of all features having p-value less than 0.05:  90.24 %
percentage of ordinal features having p-value less than 0.05:  84.0 %
percentage of  nominal features having p-value less than 0.05:  100.0 %
percentage of continous features having p-value less than 0.05:  100.0 %
In [48]:
#removing noise indices from the embeddings
non_noise_indices= np.where(np.array(cluster_list_dbscan)!=-1)
FDC_emb_high= FDC_emb_high.iloc[non_noise_indices]
FDC_emb_low= FDC_emb_low.iloc[non_noise_indices]
cluster_list_dbscan= np.array(cluster_list_dbscan)[non_noise_indices]
In [49]:
silhouette_score(FDC_emb_low, cluster_list_dbscan, metric='euclidean')
Out[49]:
0.6771768433382325
In [50]:
dunn_index(cluster_wise_df(FDC_emb_low,cluster_list_dbscan))
Out[50]:
0.19018725042006301